knitr::opts_chunk$set(echo = TRUE)
This RMarkdown shows how to read in the final project data. It also
shows how to calculate the logit-transformed response and setup the
binary outcome for use with caret or
tidymodels. It also demonstrates how to fit a simple model
(with lm()), save that model, and load it back into the
workspace. You may find these actions helpful as you work through the
project.
You must download the data from Canvas and save the data in the same directory as this RMarkdown file.
This example uses the tidyverse suite of packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggplot2)
Please download the final project data from Canvas. If this Rmarkdown file is located in the same directory as the downloaded CSV file, it will be able to load in the data for you. It is highly recommended that you use an RStudio RProject to easily manage the working directory and file paths of the code and objects associated with the final project.
The code chunk below reads in the final project data.
df <- readr::read_csv("paint_project_train_data.csv", col_names = TRUE)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The readr::read_csv() function displays the data types
and column names associated with the data. However, a glimpse is shown
below that reveals the number of rows and also shows some of the
representative values for the columns.
df %>% glimpse()
## Rows: 835
## Columns: 8
## $ R <dbl> 172, 26, 172, 28, 170, 175, 90, 194, 171, 122, 0, 88, 144, …
## $ G <dbl> 58, 88, 94, 87, 66, 89, 78, 106, 68, 151, 121, 140, 82, 163…
## $ B <dbl> 62, 151, 58, 152, 58, 65, 136, 53, 107, 59, 88, 58, 132, 50…
## $ Lightness <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ Hue <dbl> 4, 31, 8, 32, 5, 6, 34, 10, 1, 21, 24, 22, 36, 16, 26, 12, …
## $ response <dbl> 12, 10, 16, 10, 11, 16, 10, 19, 14, 25, 14, 19, 14, 38, 15,…
## $ outcome <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
Assign numeric and non-numeric fields into variables for reference later.
sel_num <- df %>% select_if(is_double) %>% select(sort(names(.))) %>% colnames()
sel_cat <- df %>% select(!one_of(sel_num)) %>% colnames()
Performing some required data transformations for the
response and outcome fields to facilitate
processing.
df_eda <- df %>% mutate(
log_response = log(response),
binary_outcome = ifelse(outcome=="event", 1, 0))
Get a summary of the categorical variables Lightness and Saturation, as well as the binary response. Check missing values, unique values.
summarize_cate_var <- function(df) {
tibble(
variable = names(df),
n_missing = map_dbl(df, ~sum(is.na(.))),
n_levels = map_dbl(df, n_distinct),
) %>% mutate(
percent_missing = n_missing/nrow(df)*100,
)
}
summ_cate <- df_eda %>% select(all_of(sel_cat)) %>% summarize_cate_var()
summ_cate %>% knitr::kable(caption = "categorical variables overview")
| variable | n_missing | n_levels | percent_missing |
|---|---|---|---|
| Lightness | 0 | 7 | 0 |
| Saturation | 0 | 7 | 0 |
Count the unique levels for each category and sort by count for each category.
df_eda %>% select(sel_cat) %>% pivot_longer(everything(), names_to ="var", values_to="level") %>%
count(var,level) %>% arrange(var,n) %>%
knitr::kable(caption = "categorical variables count")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(sel_cat)
##
## # Now:
## data %>% select(all_of(sel_cat))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
| var | level | n |
|---|---|---|
| Lightness | dark | 117 |
| Lightness | deep | 119 |
| Lightness | midtone | 119 |
| Lightness | saturated | 119 |
| Lightness | light | 120 |
| Lightness | soft | 120 |
| Lightness | pale | 121 |
| Saturation | gray | 83 |
| Saturation | neutral | 122 |
| Saturation | bright | 126 |
| Saturation | muted | 126 |
| Saturation | pure | 126 |
| Saturation | shaded | 126 |
| Saturation | subdued | 126 |
Use barchart to visualize the distribution of categorical input and binary response variables.
df_eda %>% select(sel_cat) %>%
pivot_longer(everything()) %>%
ggplot() +
geom_bar(aes(x=value, fill=value)) +
facet_grid(~name, scales="free_x", space="free_x") +
xlab("") + theme(legend.position="none")
Get a high level summary of the numeric variables. Check missing values, unique values, range and median.
summarize_nume_var <- function(df) {
tibble(
variable = names(df),
n_missing = map_dbl(df, ~sum(is.na(.))),
n_unique = map_dbl(df, n_distinct),
min = map_dbl(df, min),
median = map_dbl(df, median),
max = map_dbl(df, max)
) %>% mutate(
percent_missing = n_missing/nrow(df)*100,
percent_unique = n_unique/nrow(df)*100,
)
}
summ_nume <- summarize_nume_var(df %>% select(all_of(sel_num)))
summ_nume %>% knitr::kable(caption = "numeric variables overview")
| variable | n_missing | n_unique | min | median | max | percent_missing | percent_unique |
|---|---|---|---|---|---|---|---|
| B | 0 | 183 | 47 | 170 | 238 | 0 | 21.916168 |
| G | 0 | 171 | 58 | 188 | 241 | 0 | 20.479042 |
| Hue | 0 | 36 | 1 | 17 | 36 | 0 | 4.311377 |
| outcome | 0 | 2 | 0 | 0 | 1 | 0 | 0.239521 |
| R | 0 | 197 | 0 | 203 | 255 | 0 | 23.592814 |
| response | 0 | 83 | 6 | 51 | 87 | 0 | 9.940120 |
Examine the distribution of the continuous response.
ggplot(df_eda, aes(x = R, fill = "R")) +
geom_histogram(position = "identity", alpha = 0.7, bins = 30) +
geom_histogram(aes(x = G, fill = "G"), position = "identity", alpha = 0.7, bins = 30) +
geom_histogram(aes(x = B, fill = "B"), position = "identity", alpha = 0.7, bins = 30) +
geom_histogram(aes(x = Hue, fill = "Hue"), position = "identity", alpha = 0.7, bins = 30) +
labs(title = "Combined Histogram for R, G, B, Hue",
x = "Values",
y = "Frequency") +
scale_fill_manual(values = c("R" = "blue", "G" = "green", "B" = "red", "Hue" = "purple")) +
theme_minimal()
The distribution of continuous variables seems gaussian.
#Condition (group) the continuous variables based on the categorical variables.
# Boxplot for R grouped by Lightness and Saturation
ggplot(df, aes(x = Lightness, y = R, fill = Saturation)) +
geom_boxplot() +
labs(title = "Boxplot of R by Lightness and Saturation")
ggplot(df, aes(x = Lightness, y = G, fill = Saturation)) +
geom_boxplot() +
labs(title = "Boxplot of G by Lightness and Saturation")
ggplot(df, aes(x = Lightness, y = B, fill = Saturation)) +
geom_boxplot() +
labs(title = "Boxplot of B by Lightness and Saturation")
ggplot(df, aes(x = Lightness, y = Hue, fill = Saturation)) +
geom_boxplot() +
labs(title = "Boxplot of Hue by Lightness and Saturation")
###Visualize the relationships between the continuous inputs, are they correlated?
Check correlation between continuous input variables.
df_eda %>% select(all_of(sel_num)) %>% select(-response) %>%
cor() %>%
corrplot::corrplot(type="upper", method="color", diag=F)
#Visualize the relationships between the continuous outputs (response and the LOGIT-transformed response, y) with respect to the continuous INPUTS.
Examine the response with regard to every continuous input.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
scatterplot <- plot_ly(data = df_eda, type = "scatter", mode = "markers", alpha = 0.7) %>%
add_trace(x = ~R, y = ~response, name = "R") %>%
add_trace(x = ~G, y = ~response, name = "G") %>%
add_trace(x = ~B, y = ~response, name = "B") %>%
add_trace(x = ~Hue, y = ~response, name = "Hue") %>%
plotly::layout(title = "Scatterplot Matrix of Continuous Inputs vs. Response",
showlegend = TRUE,
legend = list(x = 0.5, y = 1, traceorder = "normal", font = list(family = "sans-serif", size = 12, color = "black")))
# Show the plot
scatterplot
response.
response.response to remove lower
bound for modelling, will examine again at log-transformed scale.Examine the log-transformed response with regard to every continuous input.
scatterplot <- plot_ly(data = df_eda, type = "scatter", mode = "markers", alpha = 0.7) %>%
add_trace(x = ~R, y = ~log_response, name = "R") %>%
add_trace(x = ~G, y = ~log_response, name = "G") %>%
add_trace(x = ~B, y = ~log_response, name = "B") %>%
add_trace(x = ~Hue, y = ~log_response, name = "Hue") %>%
plotly::layout(title = "Scatterplot Matrix of Continuous Inputs vs. Response",
showlegend = TRUE,
legend = list(x = 0.5, y = 1, traceorder = "normal", font = list(family = "sans-serif", size = 12, color = "black")))
# Show the plot
scatterplot
###Visualize the behavior of the binary outcome with respect to the continuous inputs
df_long <- gather(df_eda, key = "variable", value = "value", R, G, B, Hue)
ggplot(df_long, aes(x = value, y = outcome, color = outcome)) +
geom_point() +
facet_wrap(~variable, scales = "free") +
labs(title = "Scatter Plots of Continuous Inputs vs. Binary Outcome",
x = "Variable",
y = "Binary Outcome")
#visualize the behavior of the binary outcome with respect to the
categorical INPUTS?
df_long <- gather(df_eda, key = "variable", value = "value", Lightness,Saturation)
ggplot(df_long, aes(x = value, y = outcome, color = outcome)) +
geom_point() +
facet_wrap(~variable, scales = "free") +
labs(title = "Scatter Plots of Continuous Inputs vs. Binary Outcome",
x = "Variable",
y = "Binary Outcome")